#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(VennDiagram) ; library(venneuler)
library(knitr) ; library(expss)
library(biomaRt)
library(anRichment) ; library(BrainDiseaseCollection)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)

# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]

# Gupta
load('./../../../Gupta/AllRegions/Data/preprocessed_data.RData')
DE_info_Gupta = DE_info %>% data.frame %>% mutate('ID' = rownames(datExpr), 'significant' = padj<0.05) %>%
          dplyr::select(ID, log2FoldChange, padj, significant)


################################################################################
# Dataset with imputed values
load('./../Data/preprocessed_data_imputed.RData')
datExpr_imp = datExpr %>% data.frame
DE_info_imp = DE_info %>% data.frame
datMeta_imp = datMeta

# Update DE_info with Neuronal information
DE_info_imp = DE_info_imp %>% mutate('ID'=rownames(.)) %>% left_join(GO_neuronal, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(significant=padj<0.05 & !is.na(padj))


################################################################################
# Original dataset
load('./../Data/preprocessed_data.RData')
datExpr_orig = datExpr %>% data.frame
DE_info_orig = DE_info %>% data.frame
datMeta_orig = datMeta

# Update DE_info with Neuronal information
DE_info_orig = DE_info_orig %>% mutate('ID'=rownames(.)) %>% left_join(GO_neuronal, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(significant=padj<0.05 & !is.na(padj))


# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)


rm(GO_annotations, datExpr, DE_info, datGenes, datMeta, dds, mart, getinfo)


DE Genes


all_genes = unique(c(rownames(datExpr_orig), rownames(datExpr_imp)))

genes_df = data.frame('Original' = all_genes %in% DE_info_orig$ID[DE_info_orig$significant],
                      'Imputed' = all_genes %in% DE_info_imp$ID[DE_info_imp$significant],
                      'Gupta' = all_genes %in% DE_info_Gupta$ID[DE_info_Gupta$significant])
rownames(genes_df) = all_genes

table_info = genes_df %>% apply_labels(Original = 'Original', Imputed = 'Imputed')

cro(table_info$Original, list(table_info$Imputed, total()))
 Imputed     #Total 
 FALSE   TRUE   
 Original 
   FALSE  11391 257   11648
   TRUE  251 4248   4499
   #Total cases  11642 4505   16147
rm(table_info, all_genes)
grid.newpage()
grid.draw(draw.pairwise.venn(sum(genes_df$Original), sum(genes_df$Imputed), sum(genes_df$Original & genes_df$Imputed),
          category = c('Original', 'Imputed'), fill = c('#e6b800', '#0099cc'),
          fontfamily = rep('sans-serif',3), alpha = rep(0.25,2), lty = rep('blank', 2),
          cat.fontfamily = rep('sans-serif',2)))


Comparison to Gupta’s DEA


To see if the genes that are losing/gaining the label of DE have a true biological signal I’m going to look them up in Gupta’s list of DE genes.

Any match between datasets is considered to be evidence of the true DE of the gene, since it would be unlikely for a gene to be identified as DE in two different datasets because of unrelated technical problems.


Lost DE genes

It’s not necessarily bad to lose DE genes, it would only be a problem if these genes truly contained biological signals related to ASD.

cat(paste0(sum(genes_df$Gupta[genes_df$Original & !genes_df$Imputed]), '/', sum(genes_df$Original & !genes_df$Imputed),
           ' of the genes which are no longer DE were identified as DE in Gupta\'s dataset (',
           round(100*mean(genes_df$Gupta[genes_df$Original & !genes_df$Imputed]),2),'%)'))
## 25/251 of the genes which are no longer DE were identified as DE in Gupta's dataset (9.96%)


Gained DE genes

Similarly, even though it would seem a positive thing to gain new DE genes, it’s only good when the genes contain real biological signal related to ASD.

cat(paste0(sum(genes_df$Gupta[!genes_df$Original & genes_df$Imputed]), '/', sum(!genes_df$Original & genes_df$Imputed),
           ' of the genes which are now DE are also considered to be DE in Gupta\'s dataset (',
           round(100*mean(genes_df$Gupta[!genes_df$Original & genes_df$Imputed]),2),'%)'))
## 15/257 of the genes which are now DE are also considered to be DE in Gupta's dataset (5.84%)


The matches to Gupta’s dataset are quite small, so they may not be very robust, but it seems like we are losing more biological information than we are gaining from this new preprocessing pipeline!

I’m going to try to understand what happend to the gained/lost genes that are also in Gupta’s dataset to see if I can understand where the problem is


datExpr_raw = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datExpr_raw = datExpr_raw[rownames(datExpr_raw) %in% rownames(datExpr_imp), colnames(datExpr_raw) %in% colnames(datExpr_imp)]

z_scores = z_scores = datExpr_raw %>% apply(1, function(x) abs(x-mean(x))/sd(x)) %>% t %>% data.frame

max_z_scores = data.frame('ID' = rownames(datExpr_raw),
                          'max_z_score' = apply(z_scores, 1, max)) %>%
               mutate('n_outliers' = apply(z_scores, 1, function(x) sum(x > mean(max_z_score) + 3*sd(max_z_score))))

outliers = which(max_z_scores$n_outliers>0)

max_z_scores$imputed_values = 0

datExpr = datExpr_raw
datMeta = datMeta_imp

for(i in outliers){
  max_score = abs(max(abs(datExpr[i,]-mean(datExpr[i,]))/sd(datExpr[i,]))-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)
  
  while(max_score > 3){
    sample = gsub('X','',names(which.max(abs(datExpr[i,]-mean(datExpr[i,]))/sd(datExpr[i,]))))
    diagnosis = datMeta$Diagnosis[datMeta$Dissected_Sample_ID == sample] %>% as.character
    columns = datMeta$Dissected_Sample_ID[datMeta$Diagnosis == diagnosis & datMeta$Dissected_Sample_ID != sample]
    datExpr[i,paste0('X',sample)] = datExpr[i, paste0('X',columns)] %>% mean %>% round
    
    max_z_scores$imputed_values[i] = max_z_scores$imputed_values[i] + 1
    
    # Prepare for next round
    max_score = abs(max(abs(datExpr[i,]-mean(datExpr[i,]))/sd(datExpr[i,]))-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)
  }
}

genes_df = genes_df %>% mutate('ID' = rownames(datExpr_imp)) %>% left_join(max_z_scores, by = 'ID')

rm(datExpr, datMeta, z_scores, i, outliers, max_score, sample, diagnosis, columns, datExpr_raw)



Lost Genes

lost_genes = genes_df %>% filter(Original & !Imputed & Gupta) %>% left_join(gene_names, by = c('ID'='ensembl_gene_id'))

cat(paste0('Of the ', nrow(lost_genes), ' genes we lost from Gupta\'s DE genes, ',
           sum(lost_genes$imputed_values>0), ' had had their max value imputed, which means that the other ',
           sum(lost_genes$imputed_values==0), ' genes lost their DE tag because of the samples we identified as outliers and removed.'))
## Of the 25 genes we lost from Gupta's DE genes, 2 had had their max value imputed, which means that the other 23 genes lost their DE tag because of the samples we identified as outliers and removed.
plot_function = function(genes_list, i){
  
  i = 3*i-2
  last_j = 3
  plot_list = list()
  
  for(j in 1:last_j){
    
    ID = genes_list$ID[i+j-1]
    
    if(!is.na(ID)){
      plot_data = data.frame('Sample' = colnames(datExpr_raw), 'expr' = datExpr_raw[ID,] %>% t, 'Diagnosis' = datMeta_orig$Diagnosis,
                             'removed_sample' = !colnames(datExpr_raw) %in% colnames(datExpr_imp)) %>%
                  mutate('shape' = ifelse(removed_sample, 4,16), Sample = as.character(Sample)) %>%
                  mutate('alpha' = ifelse(shape==16,0.6,1))
      colnames(plot_data)[2] = 'expr'
      
      # See if any value was imputed
      datExpr_filtered = datExpr_raw[,colnames(datExpr_raw) %in% colnames(datExpr_imp)]
      if(genes_list$imputed_values[i+j-1] > 0){
        
        # Calculate imputed value
        sample = gsub('X','',names(which.max(datExpr_filtered[ID,])))
        diagnosis = datMeta_imp$Diagnosis[datMeta_imp$Dissected_Sample_ID == sample] %>% as.character
        columns = datMeta_imp$Dissected_Sample_ID[datMeta_imp$Diagnosis == diagnosis & datMeta_imp$Dissected_Sample_ID != sample]
        imp_value = datExpr_filtered[ID, paste0('X',columns)] %>% mean %>% round
        
        # Add imputed value to plot_data
        new_row = c(paste0('X',sample), imp_value, diagnosis, FALSE, 16)
        plot_data = rbind(plot_data, new_row)
        plot_data = plot_data %>% mutate(shape = as.numeric(ifelse(Sample == paste0('X',sample), 18, shape)),
                                         expr = as.numeric(expr), Sample = as.factor(Sample),
                                         alpha = ifelse(shape==16,0.6,1))
      }
      
    # Create plot
      ggp = plot_data %>% ggplot(aes(Sample, expr+1, color=Diagnosis)) + 
                          geom_hline(yintercept = median(plot_data$expr[plot_data$Diagnosis=='ASD'])+1, color='#00BFC4') +
                          geom_hline(yintercept = median(plot_data$expr[plot_data$Diagnosis=='CTL'])+1, color='#F8766D') +
                          geom_point(alpha = plot_data$alpha, shape = plot_data$shape, lwd = plot_data$alpha*2) +
                          theme_minimal() + scale_y_log10() +
                          theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank())
      
      if(sum(plot_data$removed_sample==TRUE)>0){
        removed_samples = plot_data %>% dplyr::filter(removed_sample==TRUE) %>% 
                          mutate(x = Sample, xend = Sample, y = expr, yend=median(plot_data$expr[plot_data$Diagnosis==Diagnosis]+1),
                                 color = ifelse(Diagnosis=='ASD','#00BFC4','#F8766D'))
        
        ggp = ggp + geom_segment(data = removed_samples, aes(x=Sample, y=expr, xend=Sample, yend = yend),
                                 color = removed_samples$color, linetype = 'dotted')

        
      }
      
      if(genes_list$imputed_values[i+j-1] > 0) {
        ggp = ggp + geom_segment(aes(x=paste0('X',sample), y=expr[Sample==paste0('X',sample)][1], 
                                                 xend = paste0('X',sample), yend = expr[Sample==paste0('X',sample)][2]),
                                             alpha = 0.5, color = ifelse(diagnosis=='ASD','#00BFC4','#F8766D'))
      }
      plot_list[[j]] = ggplotly(ggp)
      
    } else last_j = j-1
  }
  
  annotations = list()
  for(j in 1:last_j) {
    annotations[[j]] = list(x = 0.1+1/last_j*(j-1) , y = 1.05, text = genes_list$Gene[i+j-1],  
                            showarrow = F, xref='paper', yref='paper')
  }
  
  p = subplot(plot_list, nrows=1) %>% layout(annotations = annotations)
  
  return(p)
}

Lost genes with imputed values

datExpr_raw = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datExpr_raw = datExpr_raw[rownames(datExpr_raw) %in% rownames(datExpr_imp), colnames(datExpr_raw) %in% colnames(datExpr_orig)]

genes_list = lost_genes %>% filter(imputed_values>0) %>% dplyr::rename('Gene' = external_gene_id)

plot_function(genes_list, 1)
rm(genes_list)

Lost genes without imputed values

These genes lost their DE tag because of the removed samples

Their adjusted p-value is still small but no longer below 0.05

genes_list = lost_genes %>% filter(imputed_values==0) %>%
                  left_join(DE_info_imp %>% dplyr::select(ID, log2FoldChange, padj), by = 'ID') %>%
                  dplyr::rename('Gene' = external_gene_id, 'LFC_now' = log2FoldChange, 'pval_now' = padj) %>%
                  left_join(DE_info_orig %>% dplyr::select(ID, log2FoldChange, padj), by = 'ID') %>%
                  dplyr::rename('LFC_before' = log2FoldChange, 'pval_before' = padj)

kable(genes_list %>% dplyr::select(Gene, LFC_before, LFC_now, pval_before, pval_now) %>% dplyr::arrange(desc(pval_now)))
Gene LFC_before LFC_now pval_before pval_now
LAIR1 0.2407547 0.1936272 0.0355348 0.0951845
DYSF -0.1874271 -0.1620711 0.0410731 0.0840487
LGALS9 0.2627449 0.2329812 0.0330231 0.0811740
CACNA1B -0.0802029 -0.0596707 0.0186024 0.0769917
GPM6B 0.0910740 0.0804009 0.0417699 0.0700452
ORMDL2 0.1870408 0.1761004 0.0439827 0.0693773
MCTP1 -0.1367840 -0.1325573 0.0494328 0.0687068
ADCY9 -0.0795976 -0.0722156 0.0372166 0.0673555
YPEL4 -0.1097279 -0.1005642 0.0417910 0.0624265
C1QA 0.4105865 0.3449752 0.0307228 0.0612419
LASP1 0.0759806 0.0666278 0.0295123 0.0611956
CACNA2D2 -0.1737752 -0.1479798 0.0286820 0.0585834
SETD1A -0.1160553 -0.1000181 0.0241586 0.0585834
WASF1 -0.1152177 -0.1127594 0.0408826 0.0580328
KLK5 0.4114224 0.4038560 0.0472606 0.0574949
ZMIZ2 -0.1089273 -0.0952768 0.0283087 0.0561784
HIP1R -0.0981093 -0.0857949 0.0266818 0.0548443
TMED5 0.1017375 0.0846044 0.0202803 0.0539410
PRDX1 0.0968230 0.0854619 0.0210837 0.0531598
VAMP5 0.2312376 0.2235125 0.0348144 0.0528330
SBF1 -0.0960837 -0.0941124 0.0485784 0.0517666
ASCC1 0.0868101 0.0819146 0.0423682 0.0514645
EIF4E1B -0.2054563 -0.1823249 0.0269735 0.0512564

It looks like the value belonging to the removed samples contained helpful information

plot_function(genes_list, 1)
plot_function(genes_list, 2)
plot_function(genes_list, 3)
plot_function(genes_list, 4)
plot_function(genes_list, 5)
plot_function(genes_list, 6)
plot_function(genes_list, 7)
plot_function(genes_list, 8)



Gained Genes

gained_genes = genes_df %>% filter(!Original & Imputed & Gupta) %>% left_join(gene_names, by = c('ID'='ensembl_gene_id'))

cat(paste0('Of the ', nrow(gained_genes), ' genes we gained from Gupta\'s DE genes, ',
           sum(gained_genes$imputed_values>0), ' had had their max value imputed, which means that the other ',
           sum(gained_genes$imputed_values==0), ' genes gained their DE tag because of the samples we identified as outliers and removed.'))
## Of the 15 genes we gained from Gupta's DE genes, 0 had had their max value imputed, which means that the other 15 genes gained their DE tag because of the samples we identified as outliers and removed.

Gained genes without imputed values

These genes gained their DE tag because of the removed samples

Their adjusted p-value is were small from the beginning but above 0.05

genes_list = gained_genes %>% filter(imputed_values==0) %>% 
             left_join(DE_info_imp %>% dplyr::select(ID, log2FoldChange, padj), by = 'ID') %>%
             dplyr::rename('Gene' = external_gene_id, 'LFC_now' = log2FoldChange, 'pval_now' = padj) %>%
             left_join(DE_info_orig %>% dplyr::select(ID, log2FoldChange, padj), by = 'ID') %>%
             dplyr::rename('LFC_before' = log2FoldChange, 'pval_before' = padj)

kable(genes_list %>% dplyr::select(Gene, LFC_before, LFC_now, pval_before, pval_now) %>% dplyr::arrange(desc(pval_before)))
Gene LFC_before LFC_now pval_before pval_now
VKORC1L1 -0.0548949 -0.0659234 0.1033499 0.0388468
PPP6R1 -0.0677603 -0.0737994 0.0856979 0.0390671
SLC30A4 -0.0959336 -0.1088998 0.0743135 0.0404671
DDA1 -0.0779904 -0.0851168 0.0717262 0.0455727
POLR2E -0.0782448 -0.0873556 0.0714477 0.0395353
FBXW5 -0.0860361 -0.0928759 0.0694332 0.0463127
MAL2 -0.1251983 -0.1405410 0.0648019 0.0402099
ARL2 -0.0995651 -0.1063787 0.0576566 0.0487203
SLC25A22 -0.1155208 -0.1193772 0.0553641 0.0445098
FBXW7 -0.1133259 -0.1229711 0.0552993 0.0430954
EEPD1 -0.1259338 -0.1321094 0.0541533 0.0379829
BCL7B -0.0711436 -0.0847214 0.0530242 0.0192720
USP42 0.0817366 0.0929551 0.0529046 0.0280962
FBLL1 -0.1243957 -0.1323026 0.0528176 0.0369741
SEC61A2 -0.0968281 -0.0965677 0.0526990 0.0496727
plot_function(genes_list, 1)
plot_function(genes_list, 2)
plot_function(genes_list, 3)
plot_function(genes_list, 4)
plot_function(genes_list, 5)